# Install (if needed) and load required packages ----
# NOTE: the common `if (!require(pkg)) install.packages(pkg)` pattern installs
# a missing package but never loads it in that session; here we check with
# requireNamespace() and then load explicitly with library().
pkgs <- c("xlsx", "fitdistrplus", "dplyr", "ggplot2", "gridExtra")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# We'll use this a few times to test fit using the fitdistrplus package,
# so let's make a function.
GoodnessPlot <- function(vector, try_list, discrete = FALSE) {
  # Given a data distribution, uses the fitdistrplus package to plot
  # goodness of fit against a list of candidate distributions.
  #
  # Args:
  #   vector:   numeric data vector to fit
  #   try_list: character vector of distribution names to test
  #   discrete: TRUE if the data are discrete counts (forwarded to fitdist)
  #
  # Returns:
  #   the gofstat() goodness-of-fit summary (Cramer-von Mises,
  #   Kolmogorov-Smirnov and Anderson-Darling statistics plus AIC/BIC);
  #   as side effects, draws four comparison plots:
  #     Density - closeness of fit for each distribution
  #     QQ      - how do the tails match
  #     CDF     - how do the cdfs match
  #     PP      - how well does the center match
  plot.legend <- try_list
  fit_list <- list()
  # seq_along() is safe for an empty try_list (1:length would give c(1, 0)).
  for (i in seq_along(try_list)) {
    name <- paste0("fit", i)
    # fitdist()'s `discrete` argument defaults to FALSE, so the flag can be
    # forwarded directly instead of branching on it.
    fit_list[[name]] <- fitdist(vector, try_list[i], discrete = discrete)
  }
  denscomp(fit_list, legendtext = plot.legend)
  qqcomp(fit_list, legendtext = plot.legend)
  cdfcomp(fit_list, legendtext = plot.legend)
  ppcomp(fit_list, legendtext = plot.legend)
  gofstat(fit_list)
}

# Problem 1 ----
df1 <- read.xlsx("Problem_Dataset_06_01.xls", 1, header = FALSE)
# Visualize the empirical distribution, then bootstrap a Cullen-Frey plot
# and summary statistics to guide the choice of candidate distributions.
plotdist(df1$X1, histo = TRUE, demp = TRUE)
descdist(df1$X1, boot = 500)
## summary statistics
## ------
## min: 2.06 max: 48.65
## median: 9.015
## mean: 11.92048
## estimated sd: 9.832445
## estimated skewness: 1.790431
## estimated kurtosis: 6.895655

# Candidate continuous distributions (renamed from `try`, which shadows
# base::try()).
candidates <- c("gamma", "weibull", "lnorm", "logis")
GoodnessPlot(df1$X1, candidates)
## Goodness-of-fit statistics
## 1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Kolmogorov-Smirnov statistic 0.07461236 0.07934941 0.05044292
## Cramer-von Mises statistic 0.05174037 0.05847634 0.01825121
## Anderson-Darling statistic 0.35230811 0.44537170 0.14142579
## 4-mle-logis
## Kolmogorov-Smirnov statistic 0.1557695
## Cramer-von Mises statistic 0.1562512
## Anderson-Darling statistic 1.3210442
##
## Goodness-of-fit criteria
## 1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion 288.2076 290.3611 284.9146
## Bayesian Information Criterion 291.6830 293.8364 288.3900
## 4-mle-logis
## Akaike's Information Criterion 308.5568
## Bayesian Information Criterion 312.0321
# It looks like the best option here is the Lognormal Distribution.
# Fit the lognormal and overlay its density on the data histogram.
fln <- fitdist(df1$X1, "lnorm")
# `params` (not `coef`) so we don't shadow stats::coef(); values are
# (meanlog, sdlog).
params <- round(coef(fln), 4)
RV <- rlnorm(10^5, params[1], params[2])
hist(df1$X1, prob = TRUE, col = "grey", ylim = c(0, .09),
     main = "Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col = "blue", lwd = 2)
print(paste0("Simio Command: Random.Lognormal(", params[1], ", ", params[2], ")"))
## [1] "Simio Command: Random.Lognormal(2.1851, 0.7712)"
# Problem 2 ----
df2 <- read.xlsx("Problem_Dataset_06_02.xls", 1, header = FALSE)

# Visualize the empirical distribution, then bootstrap a Cullen-Frey plot
# and summary statistics to guide the choice of candidate distributions.
plotdist(df2$X1, histo = TRUE, demp = TRUE)
descdist(df2$X1, boot = 500)
## summary statistics
## ------
## min: 3.8 max: 36.13
## median: 9.14
## mean: 10.89234
## estimated sd: 6.364957
## estimated skewness: 1.97635
## estimated kurtosis: 7.933002

# Candidate continuous distributions (renamed from `try`, which shadows
# base::try()).
candidates <- c("gamma", "weibull", "lnorm", "logis")
GoodnessPlot(df2$X1, candidates)
## Goodness-of-fit statistics
## 1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Kolmogorov-Smirnov statistic 0.09941791 0.1140618 0.06764782
## Cramer-von Mises statistic 0.11444195 0.2008969 0.03973639
## Anderson-Darling statistic 0.71667740 1.3230284 0.26739444
## 4-mle-logis
## Kolmogorov-Smirnov statistic 0.1251506
## Cramer-von Mises statistic 0.1480473
## Anderson-Darling statistic 1.3305012
##
## Goodness-of-fit criteria
## 1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion 288.3562 296.2684 282.6829
## Bayesian Information Criterion 292.0565 299.9687 286.3832
## 4-mle-logis
## Akaike's Information Criterion 302.0135
## Bayesian Information Criterion 305.7138
# It looks like the best option here is again the Lognormal Distribution.
# Fit the lognormal and overlay its density on the data histogram.
fln <- fitdist(df2$X1, "lnorm")
# `params` (not `coef`) so we don't shadow stats::coef(); values are
# (meanlog, sdlog).
params <- round(coef(fln), 4)
RV <- rlnorm(10^5, params[1], params[2])
hist(df2$X1, prob = TRUE, col = "grey", ylim = c(0, .11),
     main = "Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col = "blue", lwd = 2)
print(paste0("Simio Command: Random.Lognormal(", params[1], ", ", params[2], ")"))
## [1] "Simio Command: Random.Lognormal(2.2579, 0.4906)"
# Problem 3 ----
df3 <- read.xlsx("Problem_Dataset_06_03.xls", 1, header = FALSE)

# Visualize the empirical distribution, then bootstrap a Cullen-Frey plot
# and summary statistics to guide the choice of candidate distributions.
plotdist(df3$X1, histo = TRUE, demp = TRUE)
descdist(df3$X1, boot = 500)
## summary statistics
## ------
## min: 1 max: 8
## median: 3
## mean: 2.955556
## estimated sd: 1.536755
## estimated skewness: 1.02143
## estimated kurtosis: 4.428947

# Candidate distributions; the data are integer counts, so fit discretely
# (renamed from `try`, which shadows base::try()).
candidates <- c("gamma", "weibull", "lnorm")
GoodnessPlot(df3$X1, candidates, discrete = TRUE)
## Chi-squared statistic: 11.36869 6.950712 17.60503
## Degree of freedom of the Chi-squared distribution: 2 2 2
## Chi-squared p-value: 0.003398758 0.03095081 0.0001503542
## the p-value may be wrong with some theoretical counts < 5
## Chi-squared table:
## obscounts theo 1-mle-gamma theo 2-mle-weibull theo 3-mle-lnorm
## <= 1 7 2.362048 3.540233 1.731231
## <= 2 13 10.830084 9.558281 12.535323
## <= 3 11 12.872701 11.605719 13.198144
## <= 4 8 9.295658 9.679546 8.217889
## > 4 6 9.639510 10.616221 9.317412
##
## Goodness-of-fit criteria
## 1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion 160.2218 162.3428 160.9673
## Bayesian Information Criterion 163.8352 165.9561 164.5806
# Here let's try the Gamma Distribution.
# Fit the gamma and overlay its density on the data histogram.
# (Renamed `fln` -> `fgamma`: this is a gamma fit, not a lognormal.)
fgamma <- fitdist(df3$X1, "gamma", discrete = TRUE)
# `params` (not `coef`) so we don't shadow stats::coef(); values are
# (shape, rate).
params <- round(coef(fgamma), 4)
RV <- rgamma(10^5, params[1], params[2])
hist(df3$X1, prob = TRUE, col = "grey", ylim = c(0, .44),
     main = "Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col = "blue", lwd = 2)
print(paste0("Simio Command: Random.Gamma(", params[1], ", ", params[2], ")"))
## [1] "Simio Command: Random.Gamma(3.8536, 1.3038)"